
********************************************************************************
********************************************************************************
** load, merge in diagnoses, and de-duplicate 
********************************************************************************
********************************************************************************
import delimited using health/rdrp3367_inp_de_v5.csv, clear asdouble

	* every (sid, encounter_de) pair must identify exactly one row
	by sid encounter_de, sort: assert _n == 1

	* dupe = number of other rows sharing this sid and admit_time (0 = unique)
	duplicates tag sid admit_time, gen(dupe)

	* record the number of rows with a unique (sid, admit_time);
	* the replace option starts the scalar file afresh
	count if dupe == 0
	local n_unique = r(N)
	scalarout using "scratch/inpatient_duplicates.csv", ///
		id("non-duplicate admissions ") num(`n_unique')  replace

	* record the number of rows that share (sid, admit_time) with another row;
	* the script expects at least one such row
	count if dupe != 0
	local n_dupes = r(N)
	assert `n_dupes' > 0
	scalarout using "scratch/inpatient_duplicates.csv", ///
		id("duplicate admissions ") num(`n_dupes')
	

	** merge in dx info
	* keep every inpatient row whether or not it has a diagnosis record;
	* diagnosis-only rows are dropped
	merge m:1 sid encounter_de using health/dxs, keep(master match)
	gen no_dx = (_merge == 1)
	assert no_dx == missing(dx_code)

	* within each sid/admit_time group, rows that have a diagnosis sort first
	sort sid admit_time no_dx

	* tt marks a row that has a dx, whose dx code differs from the group's first
	* row, and whose coding system matches that first row
	by sid admit_time: gen tt = dx_code != dx_code[1] & !no_dx ///
		& dx_sys_id_text == dx_sys_id_text[1]

	* trouble marks every row of a group that contains at least one such row
	by sid admit_time: egen trouble = max(tt)
	count if trouble
	local n_conflict = r(N)
	assert `n_conflict' == 122
	scalarout using "scratch/inpatient_duplicates.csv", ///
		id("conflicting first DX") num(`n_conflict')
	drop tt trouble
	
	** which observations to keep? 
	* Reduce the data to one row per sid/admit_time. The sort below puts the
	* preferred row first: rows with a dx, then rows with a discharge time,
	* then rows whose coding system is not "PRVDH", then ascending n_dx.
	* NOTE(review): the original comment here said "keep if most icd dx", but
	* n_dx is sorted ascending, so the smallest n_dx comes first. Confirm what
	* n_dx measures and whether a descending sort was intended.
	gen no_discharge = missing(discharge_time)
	drop _merge
	gen not_icd = dx_sys_id_text == "PRVDH"
	sort sid admit_time no_dx no_discharge not_icd n_dx
	* tt marks a row with a dx that is not the first row among rows tied on
	* every sort key; the order of tied rows after sort is arbitrary
	by sid admit_time no_dx no_discharge not_icd n_dx: gen tt = _n~=1 & ~no_dx
	* NOTE(review): the max is taken over the whole sid/admit_time group, so
	* random_keep is also set when the tie is among rows that are not kept.
	* Confirm this is intended.
	by sid admit_time: egen random_keep = max(tt)
	* keep the first (preferred) row of each sid/admit_time group
	by sid admit_time: keep if _n ==1
	count 
	* record how many kept rows come from a group with such a tie
	count if random_keep
	scalarout using "scratch/inpatient_duplicates.csv", ///
		id("random keep ") num(`=r(N)')  

	
********************************************************************************
********************************************************************************
** clean up date/times 
********************************************************************************
********************************************************************************
	
	* convert the first 9 characters of each timestamp string (read as
	* day-month-year) into a Stata daily date: admit_date, discharge_date
	foreach ev in admit discharge {
		gen `ev'_date = date(substr(`ev'_time, 1, 9), "DMY", 2050)
	}

	* length of stay in days, counting both the admit and the discharge day
	gen los = (discharge_date - admit_date) + 1

	
********************************************************************************
********************************************************************************
** merge in test information
********************************************************************************
********************************************************************************
	
	** a. make a data set with the number of hospitalizations per patient
	* Number each patient's admissions in date order. admit_time is added as a
	* tie-breaker: sorting on sid and admit_date alone leaves same-day
	* admissions in arbitrary order, so admit_id could differ between runs.
	* The de-duplication earlier in this file keeps one row per sid/admit_time,
	* so this ordering is fully determined.
	sort sid admit_date admit_time
	by sid: gen admit_id = _n
	by sid: gen n_hospitalizations = _N

	* admission-level file, keyed by sid and admit_id
	save health/hospitalizations, replace

	* patient-level file holding only the hospitalization count
	by sid: keep if _n==1
	keep sid n_hospitalizations
	save health/n_hospitalizations, replace
	
	** b. Count tests/positives from daily panel
	use health/test_daily_panel, clear

	* keep only patients present in the hospitalization counts, including
	* those with no rows in the test panel
	merge m:1 sid using health/n_hospitalizations, keep(using match) nogen

	* make one copy of every panel row per hospitalization of that patient,
	* and number the copies so each copy pairs with one admission
	expand n_hospitalizations
	bysort sid date: gen admit_id = _n
	by sid date: assert admit_id[_N] == n_hospitalizations

	* attach the admission date and diagnosis flags for each sid/admit_id;
	* every row is required to match
	merge m:1 sid admit_id using  health/hospitalizations, assert(match) ///
		keep(master match) nogen ///
		keepusing(sid admit_id admit_date any_icli any_major no_dx)

	* days between the panel date and the admission (negative = before admit)
	gen time_since = date - admit_date
	sort sid admit_id date
	
	
	* For each named window around admission, create
	*   test_<window>: 1 when the panel date lies within [pre, post] days of admit
	*   pos_<window> : the value of positive on in-window rows, missing elsewhere
	qui foreach r in tight wide vtight late {

		* day offsets relative to admit_date that bound this window
		if "`r'" == "tight" {
			local pre = -5
			local post = 1
		}
		else if "`r'" == "wide" {
			local pre = -7
			local post = 7
		}
		else if "`r'" == "vtight" {
			local pre = -2
			local post = 0
		}
		else if "`r'" == "late" {
			local pre = -2
			local post = 4
		}

		gen test_`r' = inrange(time_since, `pre', `post')
		gen pos_`r' = positive if test_`r'
	}
	
	
	* earliest panel date for each admission
	by sid admit_id: egen first_date = min(date)

	* Earliest panel date with a positive result for each admission.
	* Missing is excluded explicitly: Stata treats a missing value as true in
	* a bare if condition, so rows with no recorded result would otherwise
	* count as positive here.
	* NOTE(review): if positive is ever missing on dated panel rows, this
	* changes first_pos and the hard-coded pos_pre count asserted later in
	* this file will need updating.
	by sid admit_id: egen first_pos = min(date) if positive & !missing(positive)

	assert ~missing(admit_date)

	* 1 when the earliest panel date / earliest positive date falls at least
	* 8 days before admission; 0 otherwise, including when there is no such date
	gen test_pre = first_date <= admit_date - 8
	gen pos_pre = first_pos <= admit_date - 8

	save health/test_hospitalized_panel, replace

	* one row per admission: the maximum of every test_ and pos_ variable
	collapse (max) test_* pos_* , by(sid admit_id)
	save health/hospitalized_tests , replace
	
	
	** c. merge back in test data
	* clear guards against unsaved data left in memory
	use health/hospitalizations, clear
	merge 1:1 sid admit_id using health/hospitalized_tests, assert(3) nogen

	* every admission must have a defined test flag for every window
	foreach v of varlist test_tight test_wide test_vtight test_pre test_late  {
		assert ~missing(`v')
	}

	* an admission with an in-window panel row but no recorded result counts
	* as not positive. late is included so that all windows are treated alike;
	* it was previously the only window left out.
	foreach w in tight wide vtight late pre {
		replace pos_`w' = 0 if test_`w' & missing(pos_`w')
	}

	* regression checks against the expected sample sizes
	count if test_pre 
	assert r(N) == 198566
	count if pos_pre == 1 
	assert r(N) == 24058
	assert _N == 981453
	
	
* labels follow the window bounds used where the test_ variables are
* generated earlier in this file: tight -5..+1, wide -7..+7, vtight -2..0,
* late -2..+4
label var test_tight  "Tested from admit - 5 to admit + 1"
label var test_wide   "Tested from admit - 7 to admit + 7"
label var test_vtight "Tested from admit - 2 to admit"
label var test_late   "Tested from admit - 2 to admit + 4"
label var pos_tight   "Positive test from admit - 5 to admit + 1"
label var pos_wide    "Positive test from admit - 7 to admit + 7"
label var pos_vtight  "Positive test from admit - 2 to admit"
label var pos_late    "Positive test from admit - 2 to admit + 4"

label var test_pre "Tested at least 8 days before admit"
label var pos_pre "Positive at least 8 days before admit (if test pre)"



********************************************************************************
********************************************************************************
** clean up 
********************************************************************************
********************************************************************************
	* remove the helper variables created during de-duplication
	drop dupe tt

	* rename the cohort flag and recode its missing values to 0
	rename in_other_cohort in_test
	replace in_test = 0 if missing(in_test)

	* variable labels for the cleaned admission-level file
	label variable sid          "patient ID"
	label variable encounter_de "Encounter ID"
	label variable inst_id_de   "Admitting institution ID"
	label variable no_dx        "Missing diagnosis information"
	label variable random_keep  "Admit kept by randomly breaking tie"
	label variable in_test      "Patient has at least one covid test"

	save health/inp_clean, replace

** make patient level version
	use health/inp_clean, clear

	* within patient, admissions that have a diagnosis sort first
	sort sid no_dx

	* admissions per patient, and how many of them have a diagnosis
	by sid: gen n_admit = _N
	* total() is the documented name of the legacy egen sum() function
	by sid: egen n_no_dx = total(no_dx)
	gen n_admit_dx = n_admit - n_no_dx
	drop n_no_dx

	* patient-level indicators: any admission with any_icli equal to 1,
	* any with any_icli equal to 0, any with any_major equal to 1
	by sid: egen ever_icli = max(any_icli==1)
	by sid: egen ever_not_icli = max(any_icli==0)
	by sid: egen ever_clear = max(any_major==1)

	assert ever_icli if any_icli == 1
	assert ever_not_icli if any_icli== 0

	* no_dx[1] is the patient's smallest no_dx, so it equals 1 only when no
	* admission has a diagnosis; such patients must have neither flag set
	by sid: assert ~ever_icli[1] & ~ever_not_icli[1] if no_dx[1]

	* keep the first row per patient; its no_dx therefore means "no admission
	* with a diagnosis"
	by sid: keep if _n==1
	keep sid n_admit n_admit_dx no_dx ever_icli ever_not_icli ever_clear
	rename no_dx no_dx_ever
	label var n_admit "# admissions"
	label var n_admit_dx "# admissions with DX"
	label var no_dx_ever "No admissions with DX"
	save health/inp_person_level, replace

	
	
